# Load the stock price table and run a principal component analysis on it.
data <- read.csv("stock_price.csv")
# scale. = TRUE rescales every column to unit variance before the decomposition.
pca_stocks <- prcomp(data, scale. = TRUE)
# Plotting a prcomp object draws its scree plot (same as screeplot(pca_stocks)).
plot(pca_stocks, main = "Stocks Principal Component Analysis")
mtext("Stocks Principal Components", side = 1, line = 1, font = 2)
# Coordinates of every row of `data` on the principal components.
stocks <- predict(pca_stocks)
# Line plot of the first principal component's loadings.
plot(
  pca_stocks$rotation[, 1],
  type = "l",
  main = "The loadings for first principal component"
)
## 2) Generate a scatter plot of principal component 1 vs principal component 2
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Loadings of the first principal component as a plain (unnamed) vector.
loadings <- as.vector(pca_stocks[2]$rotation[, "PC1"])
# Component scores, converted from a matrix to a data frame so that the
# columns can be picked by name below.
stocks <- as.data.frame.matrix(predict(pca_stocks))
library(car)
## Loading required package: carData
# Scatter plot of the scores on the first two components.
plot(
  stocks[, c("PC1", "PC2")],
  xlab = "Principal Component 1",
  ylab = "Principal Component 2",
  main = "PC1 vs PC2"
)
## 3) Generate an MDS plot:
# One label per observation, derived from the data so the labels always match
# the number of rows that are plotted.
labels <- seq_len(nrow(stocks))
new_data <- stocks[, c("PC1", "PC2")]
# Classical multidimensional scaling of the Euclidean distances between the
# rows of the component scores.
data.dist <- dist(stocks)
data.mds <- cmdscale(data.dist)
# Empty frame first, then the row indices drawn at the MDS coordinates.
plot(data.mds, type = "n")
text(data.mds)
biplot(pca_stocks)
# 2 x 2 grid with the scores of the first four components, plotted by row index.
par(mfrow = c(2, 2))
plot(pca_stocks$x[, 1], xlab = "index", ylab = "PC1", main = "Principal Component 1")
plot(pca_stocks$x[, 2], xlab = "index", ylab = "PC2", main = "Principal Component 2")
plot(pca_stocks$x[, 3], xlab = "index", ylab = "PC3", main = "Principal Component 3")
plot(pca_stocks$x[, 4], xlab = "index", ylab = "PC4", main = "Principal Component 4")
## Creating functions for K-means clustering and hierarchical clustering
#
#
#
#
library("ape")
library("zoom")
# Run k-means on `data`, print the fit and return it.
#
# Args:
#   data:       numeric matrix or data frame with one observation per row.
#   labels:     text drawn for each observation when do.scatter is TRUE.
#   k:          number of clusters.
#   do.scatter: if TRUE, draw `data` with every label coloured by its cluster
#               (meant for two-column coordinates such as an MDS layout).
#
# Returns:
#   The object returned by kmeans(); callers read its `cluster` component.
do.kmeans <- function(data, labels, k = 3, do.scatter = FALSE) {
  heading <- paste(c("K-means with Clusters", k), collapse = " ")
  print(heading)
  # Fixed seed so the random starts give the same clustering on every run.
  set.seed(123)
  data.clu <- kmeans(data, centers = k, nstart = 10)
  if (do.scatter) {
    # Draw the same `data` that was just clustered.
    plot(data, type = "n")
    text(data, labels = labels, col = rainbow(k)[data.clu$cluster])
  }
  print(data.clu)
  data.clu
}
# Hierarchical clustering of `data`, cut into `k` clusters.
#
# Args:
#   data:          numeric matrix or data frame with one observation per row.
#   methodName:    agglomeration method passed to hclust().
#   labels:        accepted so existing calls keep working; not used here.
#   k:             number of clusters to cut the tree into.
#   do.dendrogram: if TRUE, draw a fan dendrogram (as.phylo() from the ape
#                  package) with the tips coloured by cluster.
#
# Returns:
#   The cluster membership vector from cutree(hc, k); it is also printed.
do.hclust <- function(data, methodName = "single", labels, k = 3,
                      do.dendrogram = FALSE) {
  heading <- paste(
    c("Hierarchial Clustering with", k, " Clusters and Method name", methodName),
    collapse = " "
  )
  print(heading)
  hc <- hclust(dist(data), method = methodName)
  # One cut serves both the dendrogram colours and the return value, so the
  # colouring always agrees with the requested k.
  clus <- cutree(hc, k)
  if (do.dendrogram) {
    # Distinct named colours for small k; rainbow() for anything larger.
    base_colors <- c("red", "blue", "green", "purple", "orange", "brown")
    colors <- if (k <= length(base_colors)) {
      base_colors[seq_len(k)]
    } else {
      rainbow(k)
    }
    # 3 x 3 layout grid whose cells all belong to figure 1: a single plot.
    layout(matrix(rep(1, 9), nrow = 3, byrow = TRUE))
    title <- paste(
      c("Dendogram with", k, "Clusters and method is", methodName),
      collapse = " "
    )
    plot(
      as.phylo(hc),
      type = "fan",
      main = title,
      tip.color = colors[clus],
      label.offset = 1,
      cex = 0.9
    )
    #zm();
  }
  print(clus)
  clus
}
# Draw a two-dimensional layout (e.g. cmdscale() output) with one text label
# per row, coloured by cluster membership.
#
# Args:
#   data:            matrix whose first two columns are the x and y coordinates.
#   labels:          label for each row; recycled or truncated to nrow(data)
#                    so a label vector of another length cannot add points.
#   clusters:        number of clusters, i.e. the size of the colour palette.
#   methodName:      clustering method name shown in the plot title.
#   clusteredlabels: cluster index of each row, used to pick its colour.
do.mdsplot <- function(data, labels, clusters, methodName, clusteredlabels) {
  title <- paste(
    c(methodName, "Clustering with", clusters, " Cluster/Clusters"),
    collapse = " "
  )
  point_labels <- rep_len(labels, nrow(data))
  # Axis limits come from the coordinates themselves.
  plot(data, type = "n", main = title)
  # Place each label at its own (x, y) position in the layout.
  text(
    data[, 1],
    data[, 2],
    labels = point_labels,
    col = rainbow(clusters)[clusteredlabels]
  )
}
# K-means with 3 clusters on the stock table; keep only the membership vector.
clu3_kmeans <- do.kmeans(data, labels, k = 3)$cluster
## [1] "K-means with Clusters 3"
## K-means clustering with 3 clusters of sizes 37, 65, 25
##
## Cluster means:
## AA AXP BA BAC CAT CSCO CVX
## 1 16.20000 44.78486 70.50784 13.92324 97.37865 19.48216 95.71838
## 2 16.73554 47.73769 74.81431 12.58369 106.14031 17.10477 103.95338
## 3 0.01960 -0.22440 -0.35400 0.12400 -0.08480 0.22320 -0.29480
## DD DIS GE HD HPQ IBM INTC JNJ
## 1 50.88703 40.16486 19.59081 36.03676 44.08514 156.8776 20.99757 61.27676
## 2 53.74769 41.70738 19.88554 36.85415 40.05969 166.1420 21.75446 63.41246
## 3 -0.16680 -0.05520 0.05040 -0.02680 0.20040 -0.8992 -0.03080 -0.15920
## JPM KRFT KO MCD MMM MRK MSFT PFE
## 1 44.39514 63.31865 31.00378 75.00649 88.79135 33.80378 26.99838 18.82270
## 2 44.45154 66.50323 33.21785 78.68800 93.31262 34.74800 25.51031 20.35154
## 3 0.03320 -0.04400 -0.17400 -0.14840 -0.16800 0.01960 0.12560 -0.10560
## PG T TRV UTX VZ WMT XOM
## 1 63.64432 28.37946 56.51405 80.98784 35.70378 54.19054 79.48324
## 2 64.04800 30.34000 60.56123 85.56692 36.94031 53.73800 83.33677
## 3 0.01680 -0.01400 -0.04960 -0.23680 0.05120 -0.04040 -0.15320
##
## Clustering vector:
## [1] 1 1 1 1 3 1 1 1 1 3 1 1 1 1 3 1 1 1 1 3 1 1 1 1 3 1 2 1 1 3 1 2 1 2 3
## [36] 2 2 1 2 3 2 2 1 2 3 2 2 1 1 3 1 2 1 1 3 1 2 1 2 3 2 2 2 2 3 2 2 2 2 3
## [71] 2 2 2 2 3 2 2 2 2 3 2 2 2 2 3 2 2 2 2 3 2 2 2 2 3 2 2 2 2 3 2 2 2 2 3
## [106] 2 2 2 2 3 2 2 1 2 3 2 2 1 2 3 2 2 1 2 3 2 2
##
## Within cluster sum of squares by cluster:
## [1] 5449.840 9493.242 1516.377
## (between_SS / total_SS = 99.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# K-means with 6 clusters on the stock table; keep only the membership vector.
clu6_kmeans <- do.kmeans(data, labels, k = 6)$cluster
## [1] "K-means with Clusters 6"
## K-means clustering with 6 clusters of sizes 20, 18, 18, 28, 25, 18
##
## Cluster means:
## AA AXP BA BAC CAT CSCO CVX
## 1 17.26700 46.05800 73.76800 13.32200 109.42500 17.40650 107.52100
## 2 16.07444 44.62944 69.62222 14.22444 94.28667 20.65333 92.77667
## 3 17.13667 50.39444 77.72944 12.04722 109.36667 17.03667 104.62167
## 4 16.69821 44.67143 71.49393 14.19250 102.46214 18.83464 99.96929
## 5 0.01960 -0.22440 -0.35400 0.12400 -0.08480 0.22320 -0.29480
## 6 15.36222 48.75556 74.56667 10.91000 98.82944 15.48500 99.76778
## DD DIS GE HD HPQ IBM INTC JNJ
## 1 55.09850 42.50850 20.22500 37.65350 41.28000 164.8375 20.48550 60.67100
## 2 49.37389 39.13000 19.11278 35.70500 45.10111 151.9250 20.90111 61.73167
## 3 54.30444 42.23444 19.98056 36.98167 39.44111 170.2706 23.08444 66.26111
## 4 53.55536 42.25036 20.53143 37.23929 44.70250 162.1479 21.22071 60.23857
## 5 -0.16680 -0.05520 0.05040 -0.02680 0.20040 -0.8992 -0.03080 -0.15920
## 6 50.48278 38.85222 18.57556 34.70833 35.33333 164.8494 21.96222 65.83778
## JPM KRFT KO MCD MMM MRK MSFT PFE
## 1 46.01900 66.87050 32.18000 76.48400 93.28500 33.52650 25.77150 20.32100
## 2 44.21944 63.04611 31.08278 74.33222 87.47667 34.60222 27.90556 18.25278
## 3 44.12222 67.60278 34.50333 80.67278 95.60222 36.56056 25.47000 20.87778
## 4 45.85571 64.00964 31.22821 75.24179 90.85393 32.77750 26.52429 19.32679
## 5 0.03320 -0.04400 -0.17400 -0.14840 -0.16800 0.01960 0.12560 -0.10560
## 6 40.97111 65.78556 33.76444 81.30111 91.42056 35.56278 24.34667 20.40944
## PG T TRV UTX VZ WMT XOM
## 1 62.48300 30.12200 59.91950 84.92150 37.63450 52.92450 85.05900
## 2 64.38778 28.30278 54.89833 79.55000 35.47944 54.80667 77.24333
## 3 66.03389 31.44611 62.62222 88.59222 37.41278 55.10167 83.45667
## 4 62.99536 28.32429 58.84250 82.81607 36.21464 53.66679 83.28786
## 5 0.01680 -0.01400 -0.04960 -0.23680 0.05120 -0.04040 -0.15320
## 6 64.26889 30.61889 59.23056 84.14222 35.74444 53.25056 79.55167
##
## Clustering vector:
## [1] 2 2 2 2 5 2 2 2 2 5 2 2 2 2 5 2 4 2 2 5 2 4 2 4 5 4 4 4 4 5 4 4 4 4 5
## [36] 4 4 4 4 5 4 1 4 4 5 4 1 4 4 5 4 4 2 4 5 4 1 4 1 5 1 1 1 1 5 1 1 1 1 5
## [71] 1 1 4 1 5 1 1 4 1 5 1 3 1 3 5 3 3 3 3 5 3 3 3 3 5 3 3 6 3 5 6 3 6 3 5
## [106] 3 3 6 6 5 6 6 6 6 5 6 6 6 6 5 6 6 6 6 5 6 3
##
## Within cluster sum of squares by cluster:
## [1] 730.2266 1130.2217 1212.3933 1935.9846 1516.3769 791.3674
## (between_SS / total_SS = 99.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# MDS layout coloured by the k-means memberships (3 and 6 clusters).
do.mdsplot(data = data.mds, labels = labels, clusters = 3, methodName = "Kmeans", clusteredlabels = clu3_kmeans)
do.mdsplot(data = data.mds, labels = labels, clusters = 6, methodName = "Kmeans", clusteredlabels = clu6_kmeans)
# Single-linkage clustering of the MDS coordinates into 3 clusters, with dendrogram.
clu3_hclust_single <- do.hclust(data.mds, methodName = "single", labels, k = 3, do.dendrogram = TRUE)
## [1] "Hierarchial Clustering with 3 Clusters and Method name single"
## [1] 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2
## [36] 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 3 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2
## [71] 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2
## [106] 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1
# Single-linkage clustering of the MDS coordinates into 6 clusters, with dendrogram.
clu6_hclust_single <- do.hclust(data.mds, methodName = "single", labels, k = 6, do.dendrogram = TRUE)
## [1] "Hierarchial Clustering with 6 Clusters and Method name single"
## [1] 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 3 1 1 2 1 1 1 1 2
## [36] 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 4 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2
## [71] 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 5 1 1 2 1 1 1 1 2
## [106] 1 1 1 1 6 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1
# Complete-linkage clustering of the MDS coordinates into 3 clusters, with dendrogram.
clu3_hclust_complete <- do.hclust(data.mds, methodName = "complete", labels, k = 3, do.dendrogram = TRUE)
## [1] "Hierarchial Clustering with 3 Clusters and Method name complete"
## [1] 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2
## [36] 1 1 1 3 2 3 3 1 3 2 3 3 1 3 2 1 3 1 1 2 1 3 1 3 2 3 3 3 3 2 3 3 3 3 2
## [71] 3 3 3 3 2 3 3 3 3 2 3 3 3 3 2 3 3 3 3 2 3 3 3 3 2 3 3 3 3 2 3 3 3 3 2
## [106] 3 3 3 3 2 3 3 3 3 2 3 3 3 3 2 3 3 3 3 2 3 3
# Complete-linkage clustering of the MDS coordinates into 6 clusters, with dendrogram.
clu6_hclust_complete <- do.hclust(data.mds, methodName = "complete", labels, k = 6, do.dendrogram = TRUE)
## [1] "Hierarchial Clustering with 6 Clusters and Method name complete"
## [1] 1 2 1 1 3 1 2 1 2 3 2 2 1 2 3 2 2 1 2 3 2 2 1 2 3 2 2 2 2 3 2 2 2 2 3
## [36] 2 2 1 4 3 4 4 1 4 3 4 4 1 4 3 1 4 1 1 3 1 4 1 4 3 4 4 4 4 3 4 4 4 4 3
## [71] 4 4 4 4 3 4 5 4 5 3 5 5 4 5 3 5 5 5 5 3 5 5 5 5 3 5 5 6 6 3 6 6 6 6 3
## [106] 6 5 6 6 3 6 6 6 6 3 6 6 6 6 3 6 6 6 6 3 6 6
# Average-linkage clustering of the MDS coordinates into 3 clusters, with dendrogram.
clu3_hclust_average <- do.hclust(data.mds, methodName = "average", labels, k = 3, do.dendrogram = TRUE)
## [1] "Hierarchial Clustering with 3 Clusters and Method name average"
## [1] 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2
## [36] 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2 1 1 1 1 2
## [71] 1 1 1 1 2 1 3 1 3 2 3 3 1 3 2 3 3 3 3 2 3 3 3 3 2 3 3 3 3 2 3 3 3 3 2
## [106] 3 3 3 3 2 3 3 3 3 2 3 3 3 3 2 3 3 3 3 2 3 3
# Average-linkage clustering of the MDS coordinates into 6 clusters, with dendrogram.
clu6_hclust_average <- do.hclust(data.mds, methodName = "average", labels, k = 6, do.dendrogram = TRUE)
## [1] "Hierarchial Clustering with 6 Clusters and Method name average"
## [1] 1 2 2 2 3 2 2 2 2 3 2 2 2 2 3 2 2 2 2 3 2 2 2 2 3 2 2 2 2 3 2 2 2 2 3
## [36] 2 2 1 1 3 1 1 1 1 3 1 1 1 1 3 1 1 4 1 3 1 1 1 1 3 1 1 1 1 3 1 1 1 1 3
## [71] 1 1 1 1 3 1 5 1 5 3 5 5 1 5 3 5 5 5 5 3 5 5 5 5 3 5 5 6 5 3 6 5 6 5 3
## [106] 5 5 6 6 3 6 6 6 6 3 6 6 6 6 3 6 6 6 6 3 6 5
# MDS layout coloured by each hierarchical clustering result
# (single, complete and average linkage; 3 and 6 clusters each).
do.mdsplot(data = data.mds, labels = labels, clusters = 3, methodName = "HClust-single", clusteredlabels = clu3_hclust_single)
do.mdsplot(data = data.mds, labels = labels, clusters = 6, methodName = "HClust-single", clusteredlabels = clu6_hclust_single)
do.mdsplot(data = data.mds, labels = labels, clusters = 3, methodName = "HClust-complete", clusteredlabels = clu3_hclust_complete)
do.mdsplot(data = data.mds, labels = labels, clusters = 6, methodName = "HClust-complete", clusteredlabels = clu6_hclust_complete)
do.mdsplot(data = data.mds, labels = labels, clusters = 3, methodName = "HClust-average", clusteredlabels = clu3_hclust_average)
do.mdsplot(data = data.mds, labels = labels, clusters = 6, methodName = "HClust-average", clusteredlabels = clu6_hclust_average)
# Senator Data
library("foreign")
# Read the roll-call file once. `raw_data` keeps every column (including
# `party`); `data` keeps columns 10 onward, which feed the distance matrix.
raw_data <- read.dta("sen113kh.dta")
data <- raw_data[, 10:ncol(raw_data)]
#sen_data=prcomp(data, scale=TRUE)
#plot(sen_data,main="113th Congress Data") ## same as screeplot(pcafood)
#mtext(side=1, "Principal Components", line=1, font=2)
# One label per row of this data set, replacing the labels built for the
# stock data, which has a different number of rows.
labels <- seq_len(nrow(data))
# Classical MDS of the Euclidean distances between the rows of `data`.
data.dist <- dist(data)
data.mds <- cmdscale(data.dist)
# Colour the layout by party: the party code divided by 100 is used as the
# colour index (assumes codes 100 and 200 -- TODO confirm how other codes,
# if any, should be coloured).
do.mdsplot(data = data.mds, labels = labels, clusters = 2, methodName = "Democrats and Republicans", clusteredlabels = as.matrix(raw_data["party"] / 100))
# Single-linkage clustering of the MDS coordinates into 2 clusters, with dendrogram.
clu2_hclust_single <- do.hclust(data.mds, methodName = "single", labels, k = 2, do.dendrogram = TRUE)
## [1] "Hierarchial Clustering with 2 Clusters and Method name single"
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
# Complete-linkage clustering of the MDS coordinates into 2 clusters, with dendrogram.
clu2_hclust_complete <- do.hclust(data.mds, methodName = "complete", labels, k = 2, do.dendrogram = TRUE)
## [1] "Hierarchial Clustering with 2 Clusters and Method name complete"
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 1 1 1 2 2 1 1 2 1 2 2 2 2 2 2 2 2 1
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 2 1 1 2 2 1 1 2 1 1 2 1 2 1 1 1 1 1
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 1 2
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 2 2 1 1 1 2 1 2 2 2 2 2 2 2 2 2 1 2
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 2 1 2 1 1 1 2 2 2 1 2 2 1 1 1 2 1 1
## 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
## 1 1 1 1 2 2 2 2 2 2 2 2 1 2 1 1
# Average-linkage clustering of the MDS coordinates into 2 clusters, with dendrogram.
clu2_hclust_average <- do.hclust(data.mds, methodName = "average", labels, k = 2, do.dendrogram = TRUE)
## [1] "Hierarchial Clustering with 2 Clusters and Method name average"
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
# K-means with 2 clusters on the MDS coordinates; keep only the membership vector.
clu2_kmeans <- do.kmeans(data.mds, labels, k = 2)$cluster
## [1] "K-means with Clusters 2"
## K-means clustering with 2 clusters of sizes 45, 61
##
## Cluster means:
## [,1] [,2]
## 1 -54.02422 0.03559439
## 2 39.85393 -0.02625816
##
## Clustering vector:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 1 1 1 1 2 1 1 2 1 2 2 2 2 2 2 2 2 1
## 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 2 1 1 2 2 1 1 2 1 1 2 1 2 1 1 1 1 1
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 1 2
## 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
## 2 2 1 1 1 2 1 2 2 2 2 2 2 2 2 2 1 2
## 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
## 2 1 2 1 1 1 2 2 2 1 2 2 1 1 1 2 1 1
## 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
## 1 1 1 1 2 2 2 2 2 2 2 2 1 2 1 1
##
## Within cluster sum of squares by cluster:
## [1] 16329.72 15305.68
## (between_SS / total_SS = 87.8 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Task 2: Analyze US Senator roll call data. The objective is to identify and visualize the clustering patterns of the senators' voting activities.
# MDS layout coloured by each 2-cluster result
# (three hierarchical linkages, then k-means).
do.mdsplot(data = data.mds, labels = labels, clusters = 2, methodName = "HClust-single", clusteredlabels = clu2_hclust_single)
do.mdsplot(data = data.mds, labels = labels, clusters = 2, methodName = "HClust-complete", clusteredlabels = clu2_hclust_complete)
do.mdsplot(data = data.mds, labels = labels, clusters = 2, methodName = "HClust-average", clusteredlabels = clu2_hclust_average)
do.mdsplot(data = data.mds, labels = labels, clusters = 2, methodName = "Kmeans", clusteredlabels = clu2_kmeans)
# 2) Use k-means and hierarchical clustering to group the senators, and color the senators on the MDS plots based on the clustering results.
# Store every 2-cluster membership vector as its own column of the senator
# table so it can be compared with the `party` column row by row.
raw_data$cluster_kmeans <- clu2_kmeans
raw_data$cluster_hclust_single <- clu2_hclust_single
raw_data$cluster_hclust_complete <- clu2_hclust_complete
raw_data$cluster_hclust_average <- clu2_hclust_average
# Compare one cluster against one party, row by row, over the global
# `raw_data` table.
#
# Args:
#   party:         party code treated as the positive class.
#   cluster:       cluster id expected to hold that party.
#   clusterMethod: name of the `raw_data` column holding the cluster ids.
#
# Returns:
#   c(TP, TN, FP, FN): rows in the cluster and in the party, rows in neither,
#   rows in the cluster but not in the party, rows in the party but outside
#   the cluster.
#
# Each mismatch is printed with its row index. The wording matches how this
# file calls the function, with `party` set to "100"
# (assumes 100 is the Democrat code -- TODO confirm against the data source).
do.check <- function(party, cluster, clusterMethod = "cluster_kmeans") {
  TP <- 0
  TN <- 0
  FP <- 0
  FN <- 0
  for (x in seq_len(nrow(raw_data))) {
    in_cluster <- raw_data[x, clusterMethod] == cluster
    in_party <- raw_data[x, "party"] == party
    if (in_cluster && in_party) {
      TP <- TP + 1
    } else if (!in_cluster && !in_party) {
      TN <- TN + 1
    } else if (in_cluster) {
      # In the party's cluster without belonging to the party.
      FP <- FP + 1
      print("Republican considered as a democrat on index")
      print(x)
    } else {
      # Belongs to the party but was placed in the other cluster.
      FN <- FN + 1
      print("Democrat considered as a republican on index")
      print(x)
    }
  }
  c(TP, TN, FP, FN)
}
#raw_data[c("cluster_kmeans","party")]
#cluster_kmeans=raw_data["cluster_kmeans"];
# Decide which of the two clusters stands for party 100 and return the
# match/mismatch counts for that pairing.
#
# Args:
#   cluster: name of the `raw_data` column holding the cluster ids (1 or 2).
#
# Returns:
#   The c(TP, TN, FP, FN) vector produced by do.check() for party "100" and
#   the chosen cluster.
#
# The cluster holding most of the party-100 rows is chosen (cluster 1 on a
# tie). Only party-100 rows take part in the vote, so the choice does not
# depend on how many rows belong to other parties.
do.confusionMatrix <- function(cluster = "cluster_kmeans") {
  is_party_100 <- raw_data[["party"]] == 100
  in_cluster_1 <- raw_data[[cluster]] == 1
  n_in_cluster_1 <- sum(is_party_100 & in_cluster_1)
  n_elsewhere <- sum(is_party_100 & !in_cluster_1)
  if (n_in_cluster_1 >= n_elsewhere) {
    print("100 is Cluster 1")
    conf_mat <- do.check("100", "1", clusterMethod = cluster)
  } else {
    print("100 is Cluster 2")
    conf_mat <- do.check("100", "2", clusterMethod = cluster)
  }
  conf_mat
}
# Match/mismatch counts of the k-means clusters against the party column.
conf_mat_kmeans <- do.confusionMatrix("cluster_kmeans")
## [1] "100 is Cluster 2"
## [1] "Republican conidered as a democrat on index"
## [1] 1
## [1] "Democrat conidered as a republican on index"
## [1] 38
## [1] "Democrat conidered as a republican on index"
## [1] 39
## [1] "Democrat conidered as a republican on index"
## [1] 65
## [1] "Democrat conidered as a republican on index"
## [1] 95
# Show the k-means counts, in the order TP, TN, FP, FN.
conf_mat_kmeans
## [1] 57 44 4 1
# Match/mismatch counts of the single-linkage clusters against the party column.
conf_hclust_single <- do.confusionMatrix("cluster_hclust_single")
## [1] "100 is Cluster 2"
## [1] "Republican conidered as a democrat on index"
## [1] 1
## [1] "Democrat conidered as a republican on index"
## [1] 2
## [1] "Democrat conidered as a republican on index"
## [1] 3
## [1] "Democrat conidered as a republican on index"
## [1] 4
## [1] "Democrat conidered as a republican on index"
## [1] 6
## [1] "Democrat conidered as a republican on index"
## [1] 7
## [1] "Democrat conidered as a republican on index"
## [1] 9
## [1] "Democrat conidered as a republican on index"
## [1] 18
## [1] "Democrat conidered as a republican on index"
## [1] 20
## [1] "Democrat conidered as a republican on index"
## [1] 21
## [1] "Democrat conidered as a republican on index"
## [1] 24
## [1] "Democrat conidered as a republican on index"
## [1] 25
## [1] "Democrat conidered as a republican on index"
## [1] 27
## [1] "Democrat conidered as a republican on index"
## [1] 28
## [1] "Democrat conidered as a republican on index"
## [1] 30
## [1] "Democrat conidered as a republican on index"
## [1] 32
## [1] "Democrat conidered as a republican on index"
## [1] 33
## [1] "Democrat conidered as a republican on index"
## [1] 34
## [1] "Democrat conidered as a republican on index"
## [1] 35
## [1] "Democrat conidered as a republican on index"
## [1] 36
## [1] "Democrat conidered as a republican on index"
## [1] 38
## [1] "Democrat conidered as a republican on index"
## [1] 39
## [1] "Democrat conidered as a republican on index"
## [1] 50
## [1] "Democrat conidered as a republican on index"
## [1] 51
## [1] "Democrat conidered as a republican on index"
## [1] 53
## [1] "Democrat conidered as a republican on index"
## [1] 57
## [1] "Democrat conidered as a republican on index"
## [1] 58
## [1] "Democrat conidered as a republican on index"
## [1] 59
## [1] "Democrat conidered as a republican on index"
## [1] 61
## [1] "Democrat conidered as a republican on index"
## [1] 65
## [1] "Democrat conidered as a republican on index"
## [1] 71
## [1] "Democrat conidered as a republican on index"
## [1] 74
## [1] "Democrat conidered as a republican on index"
## [1] 76
## [1] "Democrat conidered as a republican on index"
## [1] 77
## [1] "Democrat conidered as a republican on index"
## [1] 78
## [1] "Democrat conidered as a republican on index"
## [1] 82
## [1] "Democrat conidered as a republican on index"
## [1] 85
## [1] "Democrat conidered as a republican on index"
## [1] 86
## [1] "Democrat conidered as a republican on index"
## [1] 87
## [1] "Democrat conidered as a republican on index"
## [1] 89
## [1] "Democrat conidered as a republican on index"
## [1] 90
## [1] "Democrat conidered as a republican on index"
## [1] 91
## [1] "Democrat conidered as a republican on index"
## [1] 92
## [1] "Democrat conidered as a republican on index"
## [1] 93
## [1] "Democrat conidered as a republican on index"
## [1] 94
## [1] "Democrat conidered as a republican on index"
## [1] 95
## [1] "Democrat conidered as a republican on index"
## [1] 103
## [1] "Democrat conidered as a republican on index"
## [1] 105
## [1] "Democrat conidered as a republican on index"
## [1] 106
# Show the single-linkage counts, in the order TP, TN, FP, FN.
conf_hclust_single
## [1] 57 0 48 1
# Match/mismatch counts of the complete-linkage clusters against the party column.
conf_hclust_complete <- do.confusionMatrix("cluster_hclust_complete")
## [1] "100 is Cluster 2"
## [1] "Republican conidered as a democrat on index"
## [1] 1
## [1] "Democrat conidered as a republican on index"
## [1] 4
## [1] "Democrat conidered as a republican on index"
## [1] 38
## [1] "Democrat conidered as a republican on index"
## [1] 39
## [1] "Democrat conidered as a republican on index"
## [1] 65
## [1] "Democrat conidered as a republican on index"
## [1] 95
# Show the complete-linkage counts, in the order TP, TN, FP, FN.
conf_hclust_complete
## [1] 57 43 5 1
# Match/mismatch counts of the average-linkage clusters against the party column.
conf_hclust_average <- do.confusionMatrix("cluster_hclust_average")
## [1] "100 is Cluster 2"
## [1] "Republican conidered as a democrat on index"
## [1] 1
## [1] "Democrat conidered as a republican on index"
## [1] 2
## [1] "Democrat conidered as a republican on index"
## [1] 3
## [1] "Democrat conidered as a republican on index"
## [1] 4
## [1] "Democrat conidered as a republican on index"
## [1] 6
## [1] "Democrat conidered as a republican on index"
## [1] 7
## [1] "Democrat conidered as a republican on index"
## [1] 9
## [1] "Democrat conidered as a republican on index"
## [1] 18
## [1] "Democrat conidered as a republican on index"
## [1] 20
## [1] "Democrat conidered as a republican on index"
## [1] 21
## [1] "Democrat conidered as a republican on index"
## [1] 24
## [1] "Democrat conidered as a republican on index"
## [1] 25
## [1] "Democrat conidered as a republican on index"
## [1] 27
## [1] "Democrat conidered as a republican on index"
## [1] 28
## [1] "Democrat conidered as a republican on index"
## [1] 30
## [1] "Democrat conidered as a republican on index"
## [1] 32
## [1] "Democrat conidered as a republican on index"
## [1] 33
## [1] "Democrat conidered as a republican on index"
## [1] 34
## [1] "Democrat conidered as a republican on index"
## [1] 35
## [1] "Democrat conidered as a republican on index"
## [1] 36
## [1] "Democrat conidered as a republican on index"
## [1] 38
## [1] "Democrat conidered as a republican on index"
## [1] 39
## [1] "Democrat conidered as a republican on index"
## [1] 50
## [1] "Democrat conidered as a republican on index"
## [1] 51
## [1] "Democrat conidered as a republican on index"
## [1] 53
## [1] "Democrat conidered as a republican on index"
## [1] 57
## [1] "Democrat conidered as a republican on index"
## [1] 58
## [1] "Democrat conidered as a republican on index"
## [1] 59
## [1] "Democrat conidered as a republican on index"
## [1] 61
## [1] "Democrat conidered as a republican on index"
## [1] 65
## [1] "Democrat conidered as a republican on index"
## [1] 71
## [1] "Democrat conidered as a republican on index"
## [1] 74
## [1] "Democrat conidered as a republican on index"
## [1] 76
## [1] "Democrat conidered as a republican on index"
## [1] 77
## [1] "Democrat conidered as a republican on index"
## [1] 78
## [1] "Democrat conidered as a republican on index"
## [1] 82
## [1] "Democrat conidered as a republican on index"
## [1] 85
## [1] "Democrat conidered as a republican on index"
## [1] 86
## [1] "Democrat conidered as a republican on index"
## [1] 87
## [1] "Democrat conidered as a republican on index"
## [1] 89
## [1] "Democrat conidered as a republican on index"
## [1] 90
## [1] "Democrat conidered as a republican on index"
## [1] 91
## [1] "Democrat conidered as a republican on index"
## [1] 92
## [1] "Democrat conidered as a republican on index"
## [1] 93
## [1] "Democrat conidered as a republican on index"
## [1] 94
## [1] "Democrat conidered as a republican on index"
## [1] 95
## [1] "Democrat conidered as a republican on index"
## [1] 103
## [1] "Democrat conidered as a republican on index"
## [1] 105
## [1] "Democrat conidered as a republican on index"
## [1] 106
# Show the average-linkage counts, in the order TP, TN, FP, FN.
conf_hclust_average
## [1] 57 0 48 1
# Summary of the mismatches reported for k-means:
# [1] Republican considered as a democrat on index - 1
# [2] Democrat considered as a republican on index - 38
# [3] Democrat considered as a republican on index - 39
# [4] Democrat considered as a republican on index - 65
# [5] Democrat considered as a republican on index - 95
# Purity of a clustering: for every cluster take the size of its largest
# class, add these up and divide by the number of observations.
#
# Args:
#   clusters: cluster id of each observation.
#   classes:  reference class of each observation.
cluster.purity <- function(clusters, classes) {
  contingency <- table(classes, clusters)
  majority_counts <- apply(contingency, 2, max)
  sum(majority_counts) / length(clusters)
}
# Entropy of a clustering: the base-2 entropy of the class shares inside each
# cluster, averaged over clusters with weights equal to the cluster sizes.
#
# Args:
#   clusters: cluster id of each observation.
#   classes:  reference class of each observation.
cluster.entropy <- function(clusters, classes) {
  # Entropy of one column of class counts; a share of zero contributes 0.
  column_entropy <- function(counts) {
    shares <- counts / sum(counts)
    terms <- vapply(
      shares,
      function(share) {
        if (share) {
          -share * log2(share)
        } else {
          0
        }
      },
      numeric(1)
    )
    sum(terms)
  }
  contingency <- table(classes, clusters)
  per_cluster <- apply(contingency, 2, column_entropy)
  cluster_weights <- colSums(contingency) / sum(contingency)
  sum(per_cluster * cluster_weights)
}
# Purity and entropy of every clustering, scored against the party column.
# Each pair is kept both as two scalars and as a c(purity, entropy) vector.
kmeans_purity <- cluster.purity(
  as.matrix(raw_data["cluster_kmeans"]),
  as.matrix(raw_data["party"])
)
kmeans_entropy <- cluster.entropy(
  as.matrix(raw_data["cluster_kmeans"]),
  as.matrix(raw_data["party"])
)
kmeans <- c(kmeans_purity, kmeans_entropy)

hclust_single_purity <- cluster.purity(
  as.matrix(raw_data["cluster_hclust_single"]),
  as.matrix(raw_data["party"])
)
hclust_single_entropy <- cluster.entropy(
  as.matrix(raw_data["cluster_hclust_single"]),
  as.matrix(raw_data["party"])
)
hclust_single <- c(hclust_single_purity, hclust_single_entropy)

hclust_complete_purity <- cluster.purity(
  as.matrix(raw_data["cluster_hclust_complete"]),
  as.matrix(raw_data["party"])
)
hclust_complete_entropy <- cluster.entropy(
  as.matrix(raw_data["cluster_hclust_complete"]),
  as.matrix(raw_data["party"])
)
hclust_complete <- c(hclust_complete_purity, hclust_complete_entropy)

hclust_average_purity <- cluster.purity(
  as.matrix(raw_data["cluster_hclust_average"]),
  as.matrix(raw_data["party"])
)
hclust_average_entropy <- cluster.entropy(
  as.matrix(raw_data["cluster_hclust_average"]),
  as.matrix(raw_data["party"])
)
hclust_average <- c(hclust_average_purity, hclust_average_entropy)

# One column per method, one row per score.
dF <- data.frame(
  kmeans = kmeans,
  hclust_single = hclust_single,
  hclust_complete = hclust_complete,
  hclust_average = hclust_average
)
rownames(dF) <- c("purity", "entropy")
dF
## kmeans hclust_single hclust_complete hclust_average
## purity 0.9528302 0.5471698 0.9433962 0.5471698
## entropy 0.3039495 1.0984641 0.3473223 1.0984641